library(MASS)
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following object is masked from 'package:MASS':
##
## Boston
Using the lm() function to fit a simple linear regression model, with medv as the response and lstat as the predictor.
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
attach(Boston)
lm.fit <- lm(medv ~ lstat)
summary(lm.fit)
##
## Call:
## lm(formula = medv ~ lstat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <2e-16 ***
## lstat -0.95005 0.03873 -24.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
Accessing information within lm.fit
names(lm.fit)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
coef(lm.fit)
## (Intercept) lstat
## 34.5538409 -0.9500494
confint(lm.fit)
## 2.5 % 97.5 %
## (Intercept) 33.448457 35.6592247
## lstat -1.026148 -0.8739505
The predict() function can be used to produce confidence intervals and prediction intervals for the prediction of medv for a given value of lstat
predict(lm.fit, data.frame(lstat = (c(5, 10, 15))), interval = "confidence")
## fit lwr upr
## 1 29.80359 29.00741 30.59978
## 2 25.05335 24.47413 25.63256
## 3 20.30310 19.73159 20.87461
predict(lm.fit, data.frame(lstat = (c(5, 10, 15))), interval = "prediction")
## fit lwr upr
## 1 29.80359 17.565675 42.04151
## 2 25.05335 12.827626 37.27907
## 3 20.30310 8.077742 32.52846
Plotting medv and lstat along with the least squares regression line using the plot() and abline() functions
plot(lstat, medv, pch = "+")
abline(lm.fit, lwd = 3, col = "red")
plot (1:20, 1:20, pch = 1:20)
par(mfrow = c(2 , 2))
plot(predict(lm.fit), residuals(lm.fit))
plot(predict(lm.fit), rstudent(lm.fit))
plot (hatvalues(lm.fit))
which.max(hatvalues(lm.fit))
## 375
## 375
The syntax lm(y ∼ x1 + x2 + x3) is used to fit a model with three predictors, x1, x2, and x3
lm.fit = lm(medv ~ lstat + age, data = Boston)
summary(lm.fit)
##
## Call:
## lm(formula = medv ~ lstat + age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.981 -3.978 -1.283 1.968 23.158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.22276 0.73085 45.458 < 2e-16 ***
## lstat -1.03207 0.04819 -21.416 < 2e-16 ***
## age 0.03454 0.01223 2.826 0.00491 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.173 on 503 degrees of freedom
## Multiple R-squared: 0.5513, Adjusted R-squared: 0.5495
## F-statistic: 309 on 2 and 503 DF, p-value: < 2.2e-16
Using all 12 variables as predictors
lm.fit = lm(medv ~ ., data = Boston)
summary(lm.fit)
##
## Call:
## lm(formula = medv ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1304 -2.7673 -0.5814 1.9414 26.2526
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.617270 4.936039 8.431 3.79e-16 ***
## crim -0.121389 0.033000 -3.678 0.000261 ***
## zn 0.046963 0.013879 3.384 0.000772 ***
## indus 0.013468 0.062145 0.217 0.828520
## chas 2.839993 0.870007 3.264 0.001173 **
## nox -18.758022 3.851355 -4.870 1.50e-06 ***
## rm 3.658119 0.420246 8.705 < 2e-16 ***
## age 0.003611 0.013329 0.271 0.786595
## dis -1.490754 0.201623 -7.394 6.17e-13 ***
## rad 0.289405 0.066908 4.325 1.84e-05 ***
## tax -0.012682 0.003801 -3.337 0.000912 ***
## ptratio -0.937533 0.132206 -7.091 4.63e-12 ***
## lstat -0.552019 0.050659 -10.897 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.798 on 493 degrees of freedom
## Multiple R-squared: 0.7343, Adjusted R-squared: 0.7278
## F-statistic: 113.5 on 12 and 493 DF, p-value: < 2.2e-16
We want to remove age as a predictor due to its high p-value.
lm.fit1 <- update(lm.fit, ~ . - age)
lstat * age includes lstat, age, and the interaction term lstat × age as predictors
summary(lm(medv ~ lstat * age, data = Boston))
##
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.806 -4.045 -1.333 2.085 27.552
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.0885359 1.4698355 24.553 < 2e-16 ***
## lstat -1.3921168 0.1674555 -8.313 8.78e-16 ***
## age -0.0007209 0.0198792 -0.036 0.9711
## lstat:age 0.0041560 0.0018518 2.244 0.0252 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared: 0.5557, Adjusted R-squared: 0.5531
## F-statistic: 209.3 on 3 and 502 DF, p-value: < 2.2e-16
Performing a regression of medv onto lstat and lstat^2
lm.fit2 <- lm(medv ~ lstat + I(lstat^2))
summary(lm.fit2)
##
## Call:
## lm(formula = medv ~ lstat + I(lstat^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.2834 -3.8313 -0.5295 2.3095 25.4148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.862007 0.872084 49.15 <2e-16 ***
## lstat -2.332821 0.123803 -18.84 <2e-16 ***
## I(lstat^2) 0.043547 0.003745 11.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.524 on 503 degrees of freedom
## Multiple R-squared: 0.6407, Adjusted R-squared: 0.6393
## F-statistic: 448.5 on 2 and 503 DF, p-value: < 2.2e-16
Using anova() to see how much better our new quadratic fit is compared to the linear fit
lm.fit <- lm(medv ~ lstat)
anova(lm.fit, lm.fit2)
## Analysis of Variance Table
##
## Model 1: medv ~ lstat
## Model 2: medv ~ lstat + I(lstat^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 504 19472
## 2 503 15347 1 4125.1 135.2 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(2 , 2))
plot(lm.fit2)
Trying a fifth order polynomial transformation
lm.fit5 <- lm(medv ~ poly(lstat, 5))
summary(lm.fit5)
##
## Call:
## lm(formula = medv ~ poly(lstat, 5))
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5433 -3.1039 -0.7052 2.0844 27.1153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.5328 0.2318 97.197 < 2e-16 ***
## poly(lstat, 5)1 -152.4595 5.2148 -29.236 < 2e-16 ***
## poly(lstat, 5)2 64.2272 5.2148 12.316 < 2e-16 ***
## poly(lstat, 5)3 -27.0511 5.2148 -5.187 3.10e-07 ***
## poly(lstat, 5)4 25.4517 5.2148 4.881 1.42e-06 ***
## poly(lstat, 5)5 -19.2524 5.2148 -3.692 0.000247 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.215 on 500 degrees of freedom
## Multiple R-squared: 0.6817, Adjusted R-squared: 0.6785
## F-statistic: 214.2 on 5 and 500 DF, p-value: < 2.2e-16
Trying a log transformation
summary(lm(medv ~ log(rm), data = Boston))
##
## Call:
## lm(formula = medv ~ log(rm), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.487 -2.875 -0.104 2.837 39.816
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -76.488 5.028 -15.21 <2e-16 ***
## log(rm) 54.055 2.739 19.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.915 on 504 degrees of freedom
## Multiple R-squared: 0.4358, Adjusted R-squared: 0.4347
## F-statistic: 389.3 on 1 and 504 DF, p-value: < 2.2e-16
Given a qualitative variable such as Shelveloc, R generates dummy variables automatically. contrasts()
lm.fit <- lm(Sales ~ . + Income : Advertising + Price : Age, data = Carseats)
summary(lm.fit)
##
## Call:
## lm(formula = Sales ~ . + Income:Advertising + Price:Age, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9208 -0.7503 0.0177 0.6754 3.3413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.5755654 1.0087470 6.519 2.22e-10 ***
## CompPrice 0.0929371 0.0041183 22.567 < 2e-16 ***
## Income 0.0108940 0.0026044 4.183 3.57e-05 ***
## Advertising 0.0702462 0.0226091 3.107 0.002030 **
## Population 0.0001592 0.0003679 0.433 0.665330
## Price -0.1008064 0.0074399 -13.549 < 2e-16 ***
## ShelveLocGood 4.8486762 0.1528378 31.724 < 2e-16 ***
## ShelveLocMedium 1.9532620 0.1257682 15.531 < 2e-16 ***
## Age -0.0579466 0.0159506 -3.633 0.000318 ***
## Education -0.0208525 0.0196131 -1.063 0.288361
## UrbanYes 0.1401597 0.1124019 1.247 0.213171
## USYes -0.1575571 0.1489234 -1.058 0.290729
## Income:Advertising 0.0007510 0.0002784 2.698 0.007290 **
## Price:Age 0.0001068 0.0001333 0.801 0.423812
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.011 on 386 degrees of freedom
## Multiple R-squared: 0.8761, Adjusted R-squared: 0.8719
## F-statistic: 210 on 13 and 386 DF, p-value: < 2.2e-16
attach(Carseats)
contrasts(ShelveLoc)
## Good Medium
## Bad 0 0
## Good 1 0
## Medium 0 1
Writing and calling functions in R.
LoadLibraries <- function() {
library(ISLR2)
library(MASS)
print("The libraries have been loaded.")
}
LoadLibraries()
## [1] "The libraries have been loaded."
8) a)
Answer -
attach(Auto)
lm.fit = lm(mpg ~ horsepower)
summary(lm.fit)
##
## Call:
## lm(formula = mpg ~ horsepower)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
8) a) i)
Answer - As value p < 2.2e-16, there is a relationship between the
predictor and the response which is statistically significant.
lm.fit<-lm(mpg~horsepower,data=Auto)
summary(lm.fit)
##
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
8) a) ii)
Answer - 60.59% of the variance in mpg is explained by horsepower.
summary(lm.fit)$r.squared
## [1] 0.6059483
summary(lm.fit)$sigma
## [1] 4.905757
summary(lm.fit)$sigma/mean(Auto$mpg)
## [1] 0.2092371
8) a) iii)
Answer - Negative
coefficients(lm.fit)["horsepower"]
## horsepower
## -0.1578447
8) a) iv)
Answer - Prediction interval = 24.467 ± 9.658
predict(lm.fit, data.frame(horsepower = 98), interval = "confidence", level = 0.95)
## fit lwr upr
## 1 24.46708 23.97308 24.96108
8) b)
Answer -
plot(horsepower, mpg)
abline(lm.fit, lwd=3, col="red")
8) c)
Answer -
par(mfrow = c(2 , 2))
plot(lm.fit)
9) a)
Answer -
pairs(Auto, pch = 19)
9) b)
Answer -
Auto$name = NULL
cor(Auto, method="pearson")
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
9) c) i)
Answer - Yes, there is a relation between the predictors and response.
According to the null hypothesis all the coefficients are 0. F-statistic
is used to test the hypothesis. As p-value < 2.2e-16, null hypothesis
is rejected.
lm.fit<-lm(mpg~. , data=Auto)
summary(lm.fit)
##
## Call:
## lm(formula = mpg ~ ., data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
9) c) ii)
Answer - Displacement, weight, year, origin
9) c) iii)
Answer - mpg value increases when every other predictor is kept
constant. If there is an increase of a unit in year then there will be
increase of 0.7507727 unit in mpg.
coef(lm.fit)["year"]
## year
## 0.7507727
9) d)
Answer - The first graph shows that there is a non-linear relationship
between the response and the predictors. The second graph shows that the
residuals are normally distributed and right skewed. The third graph
shows that the constant variance of error assumption is False. The
fourth graphs shows that there are no leverage points. However, there’s
one potential leverage point.
par(mfrow = c(2,2))
plot(lm.fit,col = "red")
9) e)
Answer - Yes. few interactions are statistically significant as shown by
the Signif. codes/p-values in the summary below For example,
acceleration and origin
lm.fit = lm(mpg ~ .*. + displacement:weight,data=Auto)
summary(lm.fit)
##
## Call:
## lm(formula = mpg ~ . * . + displacement:weight, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6303 -1.4481 0.0596 1.2739 11.1386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.548e+01 5.314e+01 0.668 0.50475
## cylinders 6.989e+00 8.248e+00 0.847 0.39738
## displacement -4.785e-01 1.894e-01 -2.527 0.01192 *
## horsepower 5.034e-01 3.470e-01 1.451 0.14769
## weight 4.133e-03 1.759e-02 0.235 0.81442
## acceleration -5.859e+00 2.174e+00 -2.696 0.00735 **
## year 6.974e-01 6.097e-01 1.144 0.25340
## origin -2.090e+01 7.097e+00 -2.944 0.00345 **
## cylinders:displacement -3.383e-03 6.455e-03 -0.524 0.60051
## cylinders:horsepower 1.161e-02 2.420e-02 0.480 0.63157
## cylinders:weight 3.575e-04 8.955e-04 0.399 0.69000
## cylinders:acceleration 2.779e-01 1.664e-01 1.670 0.09584 .
## cylinders:year -1.741e-01 9.714e-02 -1.793 0.07389 .
## cylinders:origin 4.022e-01 4.926e-01 0.816 0.41482
## displacement:horsepower -8.491e-05 2.885e-04 -0.294 0.76867
## displacement:weight 2.472e-05 1.470e-05 1.682 0.09342 .
## displacement:acceleration -3.479e-03 3.342e-03 -1.041 0.29853
## displacement:year 5.934e-03 2.391e-03 2.482 0.01352 *
## displacement:origin 2.398e-02 1.947e-02 1.232 0.21875
## horsepower:weight -1.968e-05 2.924e-05 -0.673 0.50124
## horsepower:acceleration -7.213e-03 3.719e-03 -1.939 0.05325 .
## horsepower:year -5.838e-03 3.938e-03 -1.482 0.13916
## horsepower:origin 2.233e-03 2.930e-02 0.076 0.93931
## weight:acceleration 2.346e-04 2.289e-04 1.025 0.30596
## weight:year -2.245e-04 2.127e-04 -1.056 0.29182
## weight:origin -5.789e-04 1.591e-03 -0.364 0.71623
## acceleration:year 5.562e-02 2.558e-02 2.174 0.03033 *
## acceleration:origin 4.583e-01 1.567e-01 2.926 0.00365 **
## year:origin 1.393e-01 7.399e-02 1.882 0.06062 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.695 on 363 degrees of freedom
## Multiple R-squared: 0.8893, Adjusted R-squared: 0.8808
## F-statistic: 104.2 on 28 and 363 DF, p-value: < 2.2e-16
9) f)
Answer - Among these plots, log(horsepower) seems to be most linear
looking.
par(mfrow = c(2, 2))
plot(log(Auto$horsepower), Auto$mpg)
plot(sqrt(Auto$horsepower), Auto$mpg)
plot((Auto$horsepower)^2, Auto$mpg)
13) a)
Answer -
set.seed(1)
X = rnorm(100, mean = 0, sd = 1)
13) b)
Answer -
eps = rnorm(100, mean = 0, sd = sqrt(0.25))
13) c)
Answer - Lenght is 100. Beta 0 is -1 and Beta 1 is 0.5.
Y = -1 + 0.5 * X + eps
length(Y)
## [1] 100
13) d)
Answer - X and Y have a positive correlation and they have a linear
relationship
plot(X, Y,col="red")
13) e)
Answer - The estimated values are very close, the presence of
irreducible error prevents a perfect fit.
model1 = lm(Y ~ X)
coef(model1)
## (Intercept) X
## -1.0188463 0.4994698
summary(model1)$adj.r.squared
## [1] 0.4619164
13) f)
Answer -
plot(X, Y, typ = "p")
abline(model1, col = "red", lty = "dotted")
abline(a = -1, b = 0.5, col = "blue", lty = "solid")
legend("bottomright", legend = c("actual", "estimate"), col = c("blue", "red"), lty = c("solid", "dotted"))
13) g)
Answer - No, 2 coefficient is not statistically significant.
i.e.Pr(>|t|) is 1.638275e-01 (which is greater than 0.05, a common
rejection threshold). As the Adjusted R squared estimate is slightly
larger which implies this is a better model.
model2 = lm(Y ~ poly(X, 2))
summary(model2)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.9644604 0.04790161 -20.134193 2.046895e-36
## poly(X, 2)1 4.4637471 0.47901614 9.318573 3.972802e-15
## poly(X, 2)2 -0.6720309 0.47901614 -1.402940 1.638275e-01
summary(model2)$adj.r.squared
## [1] 0.4671806
13) h)
Answer - The model is more accurate as noise has decreased.
eps3 = rnorm(100, mean = 0, sd = 0.001)
y3 = -1 + 0.5 * X + eps3
model3 = lm(y3 ~ X)
coef(model3)
## (Intercept) X
## -0.9999726 0.5000212
summary(model3)$adj.r.squared
## [1] 0.9999946
plot(X, y3, typ = "p")
abline(model3, col = "red", lty = "dotted")
abline(a = -1, b = 0.5, col = "blue", lty = "solid")
legend("bottomright", legend = c("actual", "estimate"), col = c("blue", "red"), lty = c("solid", "dotted"))
13) i)
Answer - The model is less accurate as noise has increased.
eps4 = rnorm(100, mean = 0, sd = 50)
y4 = -1 + 0.5 * X + eps4
model4 = lm(y4 ~ X)
coef(model4)
## (Intercept) X
## 1.883268 -2.284303
summary(model4)$adj.r.squared
## [1] -0.008472893
plot(X, y4, typ = "p")
abline(model4, col = "red", lty = "dotted")
abline(a = -1, b = 0.5, col = "blue", lty = "solid")
legend("bottomright", legend = c("actual", "estimate"), col = c("blue", "red"), lty = c("solid", "dotted"))
13) i)
Answer - Noisier data produces larger confidence intervals for the
coefficients.
confint(model1)
## 2.5 % 97.5 %
## (Intercept) -1.1150804 -0.9226122
## X 0.3925794 0.6063602
confint(model3)
## 2.5 % 97.5 %
## (Intercept) -1.0001804 -0.9997648
## X 0.4997904 0.5002520
confint(model4)
## 2.5 % 97.5 %
## (Intercept) -8.066997 11.83353
## X -13.336396 8.76779
14) a)
Answer = y = 2 + 2x1 + 0.3x2 where coefficients are 2, 2, 0.3
set.seed(1)
x1 = runif(100)
x2 = 0.5*x1 + rnorm(100)/10
y = 2 + 2*x1 + 0.3*x2 + rnorm(100)
14) b)
Answer =
cor(x1, x2)
## [1] 0.8351212
plot(x1, x2)
14) c)
Answer = The coefficients are 2.131, 1.44, 1.01. Checking with the
p-values of our parameters, we cannot reject the hypothesis for beta 1
or beta 2 with 95% confidence. The results indicate nontrivial bias
between all of the parameters.
model = lm(y ~ x1 + x2)
summary(model)
##
## Call:
## lm(formula = y ~ x1 + x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8311 -0.7273 -0.0537 0.6338 2.3359
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1305 0.2319 9.188 7.61e-15 ***
## x1 1.4396 0.7212 1.996 0.0487 *
## x2 1.0097 1.1337 0.891 0.3754
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.056 on 97 degrees of freedom
## Multiple R-squared: 0.2088, Adjusted R-squared: 0.1925
## F-statistic: 12.8 on 2 and 97 DF, p-value: 1.164e-05
14) d)
Answer = We can confidently reject the null hypothesis now.
model = lm(y ~ x1)
summary(model)
##
## Call:
## lm(formula = y ~ x1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.89495 -0.66874 -0.07785 0.59221 2.45560
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1124 0.2307 9.155 8.27e-15 ***
## x1 1.9759 0.3963 4.986 2.66e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.055 on 98 degrees of freedom
## Multiple R-squared: 0.2024, Adjusted R-squared: 0.1942
## F-statistic: 24.86 on 1 and 98 DF, p-value: 2.661e-06
14) e)
Answer = We can confidently reject the null hypothesis now.
model = lm(y ~ x2)
summary(model)
##
## Call:
## lm(formula = y ~ x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.62687 -0.75156 -0.03598 0.72383 2.44890
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.3899 0.1949 12.26 < 2e-16 ***
## x2 2.8996 0.6330 4.58 1.37e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.072 on 98 degrees of freedom
## Multiple R-squared: 0.1763, Adjusted R-squared: 0.1679
## F-statistic: 20.98 on 1 and 98 DF, p-value: 1.366e-05
14) f)
Answer - The results do not contradict each other as the predictors are
highly correlated and are collinear. Due to collinearity its difficult
to determine how each predictor is associated with the response.
14) g)
Answer - In the model with two predictors, the last point is a
high-leverage point. In the model with x1 as sole predictor, the last
point is an outlier. In the model with x2 as sole predictor, the last
point is a high leverage point.
x1 <- c(x1, 0.1)
x2 <- c(x2, 0.8)
y <- c(y, 6)
fit1 <- lm(y ~ x1 + x2)
fit2 <- lm(y ~ x1)
fit3 <- lm(y ~ x2)
summary(fit1)
##
## Call:
## lm(formula = y ~ x1 + x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.73348 -0.69318 -0.05263 0.66385 2.30619
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.2267 0.2314 9.624 7.91e-16 ***
## x1 0.5394 0.5922 0.911 0.36458
## x2 2.5146 0.8977 2.801 0.00614 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.075 on 98 degrees of freedom
## Multiple R-squared: 0.2188, Adjusted R-squared: 0.2029
## F-statistic: 13.72 on 2 and 98 DF, p-value: 5.564e-06
summary(fit2)
##
## Call:
## lm(formula = y ~ x1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8897 -0.6556 -0.0909 0.5682 3.5665
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.2569 0.2390 9.445 1.78e-15 ***
## x1 1.7657 0.4124 4.282 4.29e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.111 on 99 degrees of freedom
## Multiple R-squared: 0.1562, Adjusted R-squared: 0.1477
## F-statistic: 18.33 on 1 and 99 DF, p-value: 4.295e-05
summary(fit3)
##
## Call:
## lm(formula = y ~ x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.64729 -0.71021 -0.06899 0.72699 2.38074
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.3451 0.1912 12.264 < 2e-16 ***
## x2 3.1190 0.6040 5.164 1.25e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.074 on 99 degrees of freedom
## Multiple R-squared: 0.2122, Adjusted R-squared: 0.2042
## F-statistic: 26.66 on 1 and 99 DF, p-value: 1.253e-06
plot(fit1)
plot(fit2)
plot(fit3)